import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Load the diamonds dataset from the CSV in the working directory, then preview it.
df = pd.read_csv(filepath_or_buffer='diamonds.csv')
df
| Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 53936 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
| 53936 | 53937 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 |
| 53937 | 53938 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 |
| 53938 | 53939 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 |
| 53939 | 53940 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
53940 rows × 11 columns
# "Unnamed: 0" is just a 1-based row counter carried over from the CSV; it adds
# nothing beyond the DataFrame's own index, so remove it.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
df = df.drop(columns="Unnamed: 0")
df
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
| 53936 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 |
| 53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 |
| 53938 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 |
| 53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
53940 rows × 10 columns
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()
146
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| carat | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|
| count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
| mean | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
| std | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
| min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
| 50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
| 75% | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
| max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
# Price against carat weight, with each point coloured by its cut.
sns.scatterplot(data=df, x='carat', y='price', hue='cut')
<Axes: xlabel='carat', ylabel='price'>
# Grid of pairwise plots across the DataFrame's columns.
sns.pairplot(data=df)
<seaborn.axisgrid.PairGrid at 0x17a58940850>
# Price of every diamond, placed on the x axis by its cut category.
sns.scatterplot(data=df, x='cut', y='price')
<Axes: xlabel='cut', ylabel='price'>
# One bar per cut; bar height is seaborn's default aggregate of price (the mean).
sns.barplot(data=df, x='cut', y='price')
<Axes: xlabel='cut', ylabel='price'>
# Keep only the first 50 rows as a small sample, then preview it.
df50 = df.head(n=50)
df50
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| 5 | 0.24 | Very Good | J | VVS2 | 62.8 | 57.0 | 336 | 3.94 | 3.96 | 2.48 |
| 6 | 0.24 | Very Good | I | VVS1 | 62.3 | 57.0 | 336 | 3.95 | 3.98 | 2.47 |
| 7 | 0.26 | Very Good | H | SI1 | 61.9 | 55.0 | 337 | 4.07 | 4.11 | 2.53 |
| 8 | 0.22 | Fair | E | VS2 | 65.1 | 61.0 | 337 | 3.87 | 3.78 | 2.49 |
| 9 | 0.23 | Very Good | H | VS1 | 59.4 | 61.0 | 338 | 4.00 | 4.05 | 2.39 |
| 10 | 0.30 | Good | J | SI1 | 64.0 | 55.0 | 339 | 4.25 | 4.28 | 2.73 |
| 11 | 0.23 | Ideal | J | VS1 | 62.8 | 56.0 | 340 | 3.93 | 3.90 | 2.46 |
| 12 | 0.22 | Premium | F | SI1 | 60.4 | 61.0 | 342 | 3.88 | 3.84 | 2.33 |
| 13 | 0.31 | Ideal | J | SI2 | 62.2 | 54.0 | 344 | 4.35 | 4.37 | 2.71 |
| 14 | 0.20 | Premium | E | SI2 | 60.2 | 62.0 | 345 | 3.79 | 3.75 | 2.27 |
| 15 | 0.32 | Premium | E | I1 | 60.9 | 58.0 | 345 | 4.38 | 4.42 | 2.68 |
| 16 | 0.30 | Ideal | I | SI2 | 62.0 | 54.0 | 348 | 4.31 | 4.34 | 2.68 |
| 17 | 0.30 | Good | J | SI1 | 63.4 | 54.0 | 351 | 4.23 | 4.29 | 2.70 |
| 18 | 0.30 | Good | J | SI1 | 63.8 | 56.0 | 351 | 4.23 | 4.26 | 2.71 |
| 19 | 0.30 | Very Good | J | SI1 | 62.7 | 59.0 | 351 | 4.21 | 4.27 | 2.66 |
| 20 | 0.30 | Good | I | SI2 | 63.3 | 56.0 | 351 | 4.26 | 4.30 | 2.71 |
| 21 | 0.23 | Very Good | E | VS2 | 63.8 | 55.0 | 352 | 3.85 | 3.92 | 2.48 |
| 22 | 0.23 | Very Good | H | VS1 | 61.0 | 57.0 | 353 | 3.94 | 3.96 | 2.41 |
| 23 | 0.31 | Very Good | J | SI1 | 59.4 | 62.0 | 353 | 4.39 | 4.43 | 2.62 |
| 24 | 0.31 | Very Good | J | SI1 | 58.1 | 62.0 | 353 | 4.44 | 4.47 | 2.59 |
| 25 | 0.23 | Very Good | G | VVS2 | 60.4 | 58.0 | 354 | 3.97 | 4.01 | 2.41 |
| 26 | 0.24 | Premium | I | VS1 | 62.5 | 57.0 | 355 | 3.97 | 3.94 | 2.47 |
| 27 | 0.30 | Very Good | J | VS2 | 62.2 | 57.0 | 357 | 4.28 | 4.30 | 2.67 |
| 28 | 0.23 | Very Good | D | VS2 | 60.5 | 61.0 | 357 | 3.96 | 3.97 | 2.40 |
| 29 | 0.23 | Very Good | F | VS1 | 60.9 | 57.0 | 357 | 3.96 | 3.99 | 2.42 |
| 30 | 0.23 | Very Good | F | VS1 | 60.0 | 57.0 | 402 | 4.00 | 4.03 | 2.41 |
| 31 | 0.23 | Very Good | F | VS1 | 59.8 | 57.0 | 402 | 4.04 | 4.06 | 2.42 |
| 32 | 0.23 | Very Good | E | VS1 | 60.7 | 59.0 | 402 | 3.97 | 4.01 | 2.42 |
| 33 | 0.23 | Very Good | E | VS1 | 59.5 | 58.0 | 402 | 4.01 | 4.06 | 2.40 |
| 34 | 0.23 | Very Good | D | VS1 | 61.9 | 58.0 | 402 | 3.92 | 3.96 | 2.44 |
| 35 | 0.23 | Good | F | VS1 | 58.2 | 59.0 | 402 | 4.06 | 4.08 | 2.37 |
| 36 | 0.23 | Good | E | VS1 | 64.1 | 59.0 | 402 | 3.83 | 3.85 | 2.46 |
| 37 | 0.31 | Good | H | SI1 | 64.0 | 54.0 | 402 | 4.29 | 4.31 | 2.75 |
| 38 | 0.26 | Very Good | D | VS2 | 60.8 | 59.0 | 403 | 4.13 | 4.16 | 2.52 |
| 39 | 0.33 | Ideal | I | SI2 | 61.8 | 55.0 | 403 | 4.49 | 4.51 | 2.78 |
| 40 | 0.33 | Ideal | I | SI2 | 61.2 | 56.0 | 403 | 4.49 | 4.50 | 2.75 |
| 41 | 0.33 | Ideal | J | SI1 | 61.1 | 56.0 | 403 | 4.49 | 4.55 | 2.76 |
| 42 | 0.26 | Good | D | VS2 | 65.2 | 56.0 | 403 | 3.99 | 4.02 | 2.61 |
| 43 | 0.26 | Good | D | VS1 | 58.4 | 63.0 | 403 | 4.19 | 4.24 | 2.46 |
| 44 | 0.32 | Good | H | SI2 | 63.1 | 56.0 | 403 | 4.34 | 4.37 | 2.75 |
| 45 | 0.29 | Premium | F | SI1 | 62.4 | 58.0 | 403 | 4.24 | 4.26 | 2.65 |
| 46 | 0.32 | Very Good | H | SI2 | 61.8 | 55.0 | 403 | 4.35 | 4.42 | 2.71 |
| 47 | 0.32 | Good | H | SI2 | 63.8 | 56.0 | 403 | 4.36 | 4.38 | 2.79 |
| 48 | 0.25 | Very Good | E | VS2 | 63.3 | 60.0 | 404 | 4.00 | 4.03 | 2.54 |
| 49 | 0.29 | Very Good | H | SI2 | 60.7 | 60.0 | 404 | 4.33 | 4.37 | 2.64 |
# Same cut-vs-price scatter, restricted to the 50-row sample.
sns.scatterplot(data=df50, x='cut', y='price')
<Axes: xlabel='cut', ylabel='price'>
# Pairwise correlation matrix of the selected numeric columns
# (note: 'depth' is not part of the selection).
df_num = df.loc[:, ['carat', 'table', 'x', 'y', 'z', 'price']].corr()
# Draw the matrix as a heatmap with the coefficient written in every cell.
sns.heatmap(data=df_num, annot=True)
<Axes: >
# Price against carat weight, with each point coloured by its color grade.
sns.scatterplot(data=df, x='carat', y='price', hue='color')
<Axes: xlabel='carat', ylabel='price'>
# Number of diamonds in each color grade.
sns.countplot(data=df, x='color')
<Axes: xlabel='color', ylabel='count'>
# Number of diamonds in each cut category.
sns.countplot(data=df, x='cut')
<Axes: xlabel='cut', ylabel='count'>
# Interactive 3D scatter: carat on x, cut on y, price on z, one colour per cut.
scatter_3d_fig = px.scatter_3d(
    data_frame=df,
    x='carat',
    y='cut',
    z='price',
    color='cut',
    labels={'carat': 'Carat', 'cut': 'Cut Quality', 'price': 'Price'},
    title='3D Scatter Plot of Carat, Cut, and Price',
)
scatter_3d_fig.show()
# Interactive 3D scatter: carat on x, color grade on y, price on z, one colour per grade.
# The title names the plotted variables (carat, color, price).
scatter_3d_fig = px.scatter_3d(
    df,
    x='carat',
    y='color',
    z='price',
    color='color',
    title='3D Scatter Plot of Carat, Color, and Price',
    labels={'carat': 'Carat', 'color': 'Color', 'price': 'Price'},
)
scatter_3d_fig.show()
# Animated carat-vs-price scatter with one animation frame (and one colour) per cut.
# Plotly does not rescale the axes between frames, so both axes are pinned to the
# full data range (with a little headroom) to keep every frame's points in view.
scatter_fig = px.scatter(
    df,
    x='carat',
    y='price',
    animation_frame='cut',
    color='cut',
    range_x=[0, df['carat'].max() * 1.05],
    range_y=[0, df['price'].max() * 1.05],
    title='Animated Scatter Plot: Carat vs Price Colored by Cut',
)
scatter_fig.show()